1 Data

1.1 Data inspection and variables

# Load the raw song data and encode genre as a factor
songs <- read.csv("songs.csv")
songs <- songs %>% mutate(genre = as.factor(genre))

# Peek at the rows (dropping the track_id column) and the structure
head(songs[, -1])
str(songs)
## 'data.frame':    6611 obs. of  15 variables:
##  $ track_id        : chr  "2DB4DdfCFMw1iaR6JaR03a" "3uwnnTQcHM1rDqSfA4gQNz" "5WtfUKzXircvW8l5aaVZWT" "3e7sxremeOE3wTySiOhGiP" ...
##  $ name            : chr  "Bam Bam (feat. Ed Sheeran)" "Cool for the Summer" "What A Time (feat. Niall Horan)" "Dusk Till Dawn (feat. Sia) - Radio Edit" ...
##  $ artists         : chr  "Camila Cabello" "Demi Lovato" "Julia Michaels" "ZAYN" ...
##  $ genre           : Factor w/ 6 levels "EDM","Hip pop",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ danceability    : num  0.756 0.583 0.516 0.259 0.573 0.488 0.559 0.694 0.803 0.262 ...
##  $ energy          : num  0.697 0.61 0.467 0.437 0.739 0.343 0.559 0.712 0.585 0.606 ...
##  $ key             : int  8 5 0 11 0 4 11 7 8 8 ...
##  $ loudness        : num  -6.38 -5.64 -6.18 -6.59 -5.74 ...
##  $ mode            : int  1 0 1 0 1 1 1 0 1 1 ...
##  $ speechiness     : num  0.0401 0.0382 0.0302 0.0386 0.129 0.0436 0.0358 0.046 0.0432 0.0484 ...
##  $ acousticness    : num  0.182 0.00425 0.662 0.102 0.0285 0.556 0.0145 0.132 0.103 0.247 ...
##  $ instrumentalness: num  0.00 1.05e-04 0.00 1.32e-06 0.00 0.00 0.00 0.00 3.94e-06 0.00 ...
##  $ liveness        : num  0.333 0.14 0.0853 0.106 0.111 0.21 0.138 0.211 0.0644 0.125 ...
##  $ valence         : num  0.956 0.336 0.386 0.0951 0.451 0.0978 0.338 0.799 0.593 0.275 ...
##  $ tempo           : num  95 114.1 132.9 180 97.1 ...
# Visualise variable types per column
vis_dat(songs)

# Missing values per column
colSums(is.na(songs))
##         track_id             name          artists            genre 
##                0                0                0                0 
##     danceability           energy              key         loudness 
##                0                0                0                0 
##             mode      speechiness     acousticness instrumentalness 
##                0                0                0                0 
##         liveness          valence            tempo 
##                0                0                0

1.2 Data cleaning

# Data cleaning: drop identifier/text columns that carry no audio information
songs_clean <- songs %>%
  select(-track_id, -name, -artists)
#  mutate(mode = as.factor(mode)) # mode takes only 0 (minor) or 1 (major)

# Boxplots of the audio features (genre and mode columns excluded)
songs_clean[, -c(1, 6)] %>%
  melt() %>%
  ggplot(aes(y = value, fill = variable)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  theme(legend.position = "none")

# Drop extreme outliers in loudness
songs_clean <- songs_clean %>%
  filter(loudness > -20)

2 Descriptive statistics

2.1 Descriptive statistics

# Summary statistics for each feature (genre column excluded)
songs_clean[, -1] %>%
  lapply(summary) %>%
  do.call(cbind, .) %>%
  as.data.frame()

2.2 Balance check

# Table of song counts per genre
songs_clean %>%
  count(genre) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped",
                full_width = FALSE,
                position = "center")
genre n
EDM 1038
Hip pop 1265
Latin 1027
Pop 1191
R&B 923
Rock 1163
# Bar chart of genre counts with count + percentage labels.
# FIX: the original label placement used hjust = ifelse(1.5, -0.1, 1.1);
# since 1.5 is coerced to TRUE this always evaluated to -0.1, so the
# constant is stated directly.
ggplot(data = songs_clean, aes(y = genre)) +
  geom_bar(aes(fill = genre)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Music Genres - Number of Cases and Percentages",
       y = "Genres",
       x = "Count") +
  expand_limits(x = 1450) +
  geom_text(stat = 'count',
            aes(label = sprintf('%s (%.1f%%)',
                after_stat(count),
                after_stat(count / sum(count) * 100))),
            hjust = -0.1) +
  scale_fill_brewer(palette = "Accent")

3 EDA

3.1 Audio features by genre

# Names of the 11 audio-feature columns (positions 2:12 of songs_clean)
feature_names <- names(songs_clean)[c(2:12)]

# Density of each audio feature by genre (mode excluded: it is binary;
# feature_names[5] is "mode"). all_of() makes the external-vector
# selection explicit and silences the tidyselect ambiguity warning.
songs_clean %>%
  select(-mode) %>%
  pivot_longer(cols = all_of(feature_names[-5])) %>%
  ggplot(aes(x = value, colour = genre)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~name, ncol = 4, scales = "free") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = 'Spotify Audio Feature Density by Genre',
       x = '', y = 'Density') +
  theme(axis.text.y = element_blank()) +
  scale_color_brewer(palette = "Accent")

# Boxplots of each audio feature by genre (mode excluded; see density plot).
# all_of() silences the tidyselect external-vector ambiguity warning.
songs_clean %>%
  select(-mode) %>%
  pivot_longer(cols = all_of(feature_names[-5])) %>%
  ggplot(aes(y = value, fill = genre)) +
  geom_boxplot() +
  facet_wrap(~name, ncol = 2, scales = "free") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = 'Spotify Audio Feature Boxplot by Genre',
       x = '', y = '') +
  scale_fill_brewer(palette = "Accent")

3.2 Correlation between audio features

# Pairwise correlations between audio features.
# FIXES: all_of() removes the tidyselect warning reported below; the
# complete theme theme_gray() is applied BEFORE theme(), because a
# complete theme replaces all prior theme settings and was silently
# undoing the centred title.
songs_clean %>%
  select(all_of(feature_names)) %>%
  scale() %>%
  cor() %>%
  ggcorrplot(type = "upper", lab = TRUE) +
  labs(title = "Correlation between Audio Features", x = "", y = "") +
  theme_gray() +
  theme(plot.title = element_text(hjust = 0.5))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(feature_names)` instead of `feature_names` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Energy and loudness are fairly highly correlated (0.65). In this case, loudness will be removed, since energy appears to give more distinction between genre groups (as seen in the density and box plot).

# Remove loudness (feature_names[4]): highly correlated with energy (0.65),
# and energy separates the genres better.
feature_names_reduced <- feature_names[-4]

# all_of() makes the external-vector selection explicit (tidyselect)
songs_clean <- songs_clean %>%
  select(genre, all_of(feature_names_reduced))

3.3 Correlation between genres

# Correlation between genres, computed on each genre's median feature
# profile: rows = genres, columns = features, transposed so cor() compares
# genres across features.
genre_corr <- songs_clean %>%
  group_by(genre) %>%
  summarise_if(is.numeric, median, na.rm = TRUE) %>%
  ungroup() %>%
  select(-genre) %>%
  scale() %>%
  t() %>%
  as.matrix() %>%
  cor()

colnames(genre_corr) <- levels(songs_clean$genre)
row.names(genre_corr) <- levels(songs_clean$genre)

# FIX: theme_gray() is a complete theme and replaces earlier theme()
# settings, so it must come before the centred-title tweak.
ggcorrplot(genre_corr, type = "upper", lab = TRUE) +
  labs(title = "Correlation between Genres", x = "", y = "") +
  theme_gray() +
  theme(plot.title = element_text(hjust = 0.5))

4 Feature selection

4.1 Logistic regression

# Quick significance screen for the features via logistic regression.
# NOTE(review): genre has 6 levels, but family = binomial fits a BINARY
# logistic model -- glm() treats a factor response as "first level vs all
# other levels" (here: EDM vs rest). If a screen across all genres is
# intended, multinom() would be the appropriate model; confirm this is
# deliberate before relying on these p-values.
logit <- glm(genre ~., data = songs_clean, family = binomial)

summary(logit)
## 
## Call:
## glm(formula = genre ~ ., family = binomial, data = songs_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4815   0.0454   0.1722   0.3864   2.8828  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      11.358811   0.573992  19.789  < 2e-16 ***
## danceability     -6.088799   0.398092 -15.295  < 2e-16 ***
## energy           -9.776767   0.437868 -22.328  < 2e-16 ***
## key               0.001991   0.013287   0.150   0.8809    
## mode              0.650747   0.095390   6.822 8.98e-12 ***
## speechiness       3.205314   0.650251   4.929 8.25e-07 ***
## acousticness      3.606148   0.508640   7.090 1.34e-12 ***
## instrumentalness -3.946038   0.276871 -14.252  < 2e-16 ***
## liveness         -0.660562   0.301207  -2.193   0.0283 *  
## valence           5.997461   0.264019  22.716  < 2e-16 ***
## tempo            -0.014599   0.002018  -7.235 4.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5746.0  on 6606  degrees of freedom
## Residual deviance: 3055.5  on 6596  degrees of freedom
## AIC: 3077.5
## 
## Number of Fisher Scoring iterations: 7
#stargazer(logit, type = 'html', summary = T, report = "vc*stp", ci=T)

4.2 PCA

# Standardise the numeric features, then run PCA by hand via the
# eigendecomposition of the covariance matrix
songs_scaled <- songs_clean %>%
  mutate_if(is.numeric, scale)

song_cov <- cov(songs_scaled[, feature_names_reduced])
song_eigen <- eigen(song_cov)

# Cumulative scree plot: share of variance explained by the leading PCs
data.frame(proportion_of_variance =
             song_eigen$values / sum(song_eigen$values)) %>%
  mutate(cumulative_prop = cumsum(proportion_of_variance),
         pc = 1:n()) %>%
  ggplot(aes(x = pc, y = cumulative_prop)) +
  geom_point() +
  geom_line() +
  ylim(c(0, 1)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = 'Cumulative Scree Plot',
       x = 'Principal Component',
       y = 'Cumulative % of variance explained')

# Keep the first two eigenvectors; flip signs so loadings read naturally
song_eigenvectors <- as.data.frame(song_eigen$vectors[, 1:2] * -1) %>%
  mutate(feature = row.names(song_cov)) %>%
  rename('PC1' = 'V1',
         'PC2' = 'V2')

# Bar chart of the PC1/PC2 loadings per feature
loadings_long <- pivot_longer(song_eigenvectors, cols = c('PC1', 'PC2'))

ggplot(loadings_long, aes(x = feature, y = value)) +
  geom_col(aes(fill = feature), position = 'dodge') +
  facet_wrap(~name, ncol = 2) +
  coord_flip() +
  labs(title = 'Principal Component Loadings', x = 'Feature', y = '') +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

# Project every song onto the first two principal components
feature_matrix <- as.matrix(songs_scaled[, feature_names_reduced])
PC <- data.frame(
    genre = songs_scaled$genre,
    PC1 = feature_matrix %*% song_eigenvectors[, 1],
    PC2 = feature_matrix %*% song_eigenvectors[, 2])

# Scatter of PC1 vs PC2, one facet per genre
ggplot(PC, aes(x = PC1, y = PC2, color = genre)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~genre) +
  labs(title = 'Plotting principal components 1 vs 2') +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_brewer(palette = "Accent")

5 Classification models

5.1 Data split

# 75/25 train/test split with a fixed seed for reproducibility.
# NOTE(review): features are scaled on the FULL data set before splitting,
# so test-set statistics leak into the scaling. Strictly, the scaling
# parameters should be estimated on the training set only and applied to
# the test set. Left as-is to keep the pasted results reproducible.
set.seed(51211)

songs_clean <- songs_clean %>% 
  mutate(mode = as.factor(mode)) %>% 
  mutate_if(is.numeric, scale)

train_index <- sample(1:nrow(songs_clean), nrow(songs_clean)*.75, replace = F)
trainSet <- songs_clean[train_index,]
testSet <- songs_clean[-train_index,]

5.2 Logistic regression

5.2.1 Basic

# Baseline multinomial logistic regression (linear terms only)
multi.log <- multinom(
  genre ~ danceability + energy + mode + speechiness +
    acousticness + instrumentalness + valence + tempo,
  data = trainSet)
stargazer(multi.log, type = 'html', summary = T, report = "vc*stp", ci=T)
Dependent variable:
Hip pop Latin Pop R&B Rock
(1) (2) (3) (4) (5)
danceability 0.032 -0.288*** -0.591*** -0.630*** -2.188***
(-0.145, 0.209) (-0.475, -0.101) (-0.759, -0.423) (-0.815, -0.445) (-2.374, -2.001)
t = 0.354 t = -3.022 t = -6.876 t = -6.668 t = -22.955
p = 0.724 p = 0.003 p = 0.000 p = 0.000 p = 0.000
energy -1.929*** -1.081*** -1.642*** -2.557*** -1.507***
(-2.140, -1.718) (-1.297, -0.865) (-1.844, -1.440) (-2.778, -2.337) (-1.720, -1.294)
t = -17.932 t = -9.809 t = -15.924 t = -22.729 t = -13.872
p = 0.000 p = 0.000 p = 0.000 p = 0.000 p = 0.000
mode1 0.273** 0.801*** 0.552*** 0.016 0.826***
(0.013, 0.533) (0.531, 1.070) (0.301, 0.802) (-0.256, 0.288) (0.560, 1.092)
t = 2.059 t = 5.828 t = 4.316 t = 0.116 t = 6.088
p = 0.040 p = 0.000 p = 0.00002 p = 0.908 p = 0.000
speechiness 0.964*** -0.077 0.054 0.113 -1.018***
(0.817, 1.110) (-0.245, 0.091) (-0.104, 0.211) (-0.058, 0.284) (-1.262, -0.775)
t = 12.909 t = -0.896 t = 0.668 t = 1.297 t = -8.205
p = 0.000 p = 0.371 p = 0.505 p = 0.195 p = 0.000
acousticness 0.400*** 1.432*** 0.942*** 0.614*** 0.450***
(0.123, 0.677) (1.160, 1.704) (0.674, 1.209) (0.340, 0.888) (0.169, 0.730)
t = 2.832 t = 10.323 t = 6.907 t = 4.393 t = 3.143
p = 0.005 p = 0.000 p = 0.000 p = 0.00002 p = 0.002
instrumentalness -1.887*** -2.760*** -1.041*** -1.003*** -0.257***
(-2.440, -1.335) (-3.677, -1.843) (-1.287, -0.795) (-1.277, -0.730) (-0.374, -0.140)
t = -6.693 t = -5.899 t = -8.302 t = -7.189 t = -4.292
p = 0.000 p = 0.000 p = 0.000 p = 0.000 p = 0.00002
valence 1.001*** 1.689*** 0.938*** 1.479*** 1.949***
(0.832, 1.170) (1.510, 1.868) (0.775, 1.101) (1.300, 1.657) (1.771, 2.127)
t = 11.580 t = 18.467 t = 11.271 t = 16.261 t = 21.440
p = 0.000 p = 0.000 p = 0.000 p = 0.000 p = 0.000
tempo -0.512*** -0.253*** -0.429*** -0.563*** -0.503***
(-0.660, -0.364) (-0.406, -0.100) (-0.574, -0.284) (-0.718, -0.408) (-0.652, -0.354)
t = -6.782 t = -3.238 t = -5.804 t = -7.129 t = -6.626
p = 0.000 p = 0.002 p = 0.000 p = 0.000 p = 0.000
Constant 0.687*** 0.035 1.144*** 0.873*** 0.018
(0.387, 0.986) (-0.343, 0.413) (0.887, 1.401) (0.605, 1.141) (-0.279, 0.315)
t = 4.498 t = 0.181 t = 8.728 t = 6.390 t = 0.116
p = 0.00001 p = 0.856 p = 0.000 p = 0.000 p = 0.908
Akaike Inf. Crit. 12,108.470 12,108.470 12,108.470 12,108.470 12,108.470
Note: p<0.1; p<0.05; p<0.01
# Test-set accuracy of the baseline multinomial model
mean(predict(multi.log, newdata = testSet, type = "class") == testSet$genre)
## [1] 0.5151332

5.2.2 With natural spline

# Multinomial logistic regression with natural-spline terms (df = 5).
# FIX: the original fitted on songs_clean (the FULL data set), so the
# test-set accuracy reported below was computed on data the model had
# already seen. Fit on trainSet, consistent with every other model.
multi.log.ns <- 
  multinom(genre ~ ns(danceability, df=5) + ns(energy, df=5) + mode + 
           ns(speechiness, df=5) + ns(acousticness, df=5) + ns(instrumentalness, df=5) + 
           ns(valence, df=5) + ns(tempo, df=5), data = trainSet)
stargazer(multi.log.ns, type = 'html', summary = T, report = "vc*stp", ci=T)
Dependent variable:
Hip pop Latin Pop R&B Rock
(1) (2) (3) (4) (5)
ns(danceability, df = 5)1 -1.545 -1.757* -1.696** -2.601*** -6.091***
(-3.696, 0.607) (-3.767, 0.254) (-3.387, -0.005) (-4.453, -0.748) (-7.540, -4.641)
t = -1.407 t = -1.713 t = -1.966 t = -2.751 t = -8.235
p = 0.160 p = 0.087 p = 0.050 p = 0.006 p = 0.000
ns(danceability, df = 5)2 -1.588 -2.004* -2.375** -3.848*** -8.828***
(-4.016, 0.841) (-4.301, 0.294) (-4.333, -0.418) (-5.985, -1.711) (-10.599, -7.056)
t = -1.281 t = -1.709 t = -2.379 t = -3.530 t = -9.767
p = 0.201 p = 0.088 p = 0.018 p = 0.0005 p = 0.000
ns(danceability, df = 5)3 0.320 0.443 -1.360** -1.741*** -7.074***
(-1.074, 1.715) (-0.909, 1.796) (-2.551, -0.170) (-3.037, -0.445) (-8.507, -5.641)
t = 0.450 t = 0.643 t = -2.239 t = -2.634 t = -9.674
p = 0.653 p = 0.521 p = 0.026 p = 0.009 p = 0.000
ns(danceability, df = 5)4 -2.880 -2.870 -4.604** -8.107*** -16.679***
(-7.879, 2.119) (-7.584, 1.843) (-8.603, -0.605) (-12.493, -3.722) (-20.787, -12.570)
t = -1.129 t = -1.194 t = -2.256 t = -3.623 t = -7.957
p = 0.259 p = 0.233 p = 0.025 p = 0.0003 p = 0.000
ns(danceability, df = 5)5 1.192 0.304 -1.036 -2.702*** -12.937***
(-0.290, 2.675) (-1.253, 1.861) (-2.502, 0.429) (-4.313, -1.092) (-17.196, -8.677)
t = 1.576 t = 0.383 t = -1.386 t = -3.289 t = -5.953
p = 0.115 p = 0.702 p = 0.166 p = 0.002 p = 0.000
ns(energy, df = 5)1 0.689 0.663 -1.818 -3.034 -2.782
(-3.759, 5.136) (-3.737, 5.064) (-6.069, 2.433) (-7.291, 1.222) (-7.078, 1.515)
t = 0.303 t = 0.295 t = -0.838 t = -1.397 t = -1.269
p = 0.762 p = 0.768 p = 0.402 p = 0.163 p = 0.205
ns(energy, df = 5)2 -0.573 0.289 -2.411 -4.231* -3.334
(-5.340, 4.193) (-4.413, 4.991) (-6.947, 2.126) (-8.798, 0.336) (-7.920, 1.251)
t = -0.236 t = 0.121 t = -1.042 t = -1.816 t = -1.425
p = 0.814 p = 0.904 p = 0.298 p = 0.070 p = 0.155
ns(energy, df = 5)3 -2.607* -0.618 -3.477** -5.947*** -4.218***
(-5.504, 0.289) (-3.489, 2.252) (-6.244, -0.710) (-8.773, -3.121) (-7.018, -1.419)
t = -1.764 t = -0.422 t = -2.463 t = -4.124 t = -2.953
p = 0.078 p = 0.673 p = 0.014 p = 0.00004 p = 0.004
ns(energy, df = 5)4 1.536 1.727 -2.965 -5.404 -2.780
(-8.125, 11.196) (-7.817, 11.271) (-12.186, 6.257) (-14.697, 3.888) (-12.097, 6.536)
t = 0.312 t = 0.355 t = -0.630 t = -1.140 t = -0.585
p = 0.756 p = 0.723 p = 0.529 p = 0.255 p = 0.559
ns(energy, df = 5)5 -5.544*** -3.018*** -6.127*** -9.464*** -4.669***
(-7.383, -3.706) (-4.845, -1.191) (-7.891, -4.363) (-11.749, -7.180) (-6.356, -2.983)
t = -5.911 t = -3.238 t = -6.808 t = -8.121 t = -5.427
p = 0.000 p = 0.002 p = 0.000 p = 0.000 p = 0.00000
mode1 0.299** 0.855*** 0.587*** 0.148 0.981***
(0.052, 0.545) (0.598, 1.112) (0.353, 0.820) (-0.105, 0.402) (0.731, 1.230)
t = 2.375 t = 6.531 t = 4.916 t = 1.146 t = 7.711
p = 0.018 p = 0.000 p = 0.00000 p = 0.252 p = 0.000
ns(speechiness, df = 5)1 -0.557 -3.217*** -0.763 -1.328** -3.397***
(-1.792, 0.678) (-4.313, -2.121) (-1.837, 0.310) (-2.418, -0.239) (-4.452, -2.343)
t = -0.884 t = -5.752 t = -1.393 t = -2.389 t = -6.316
p = 0.377 p = 0.000 p = 0.164 p = 0.017 p = 0.000
ns(speechiness, df = 5)2 -0.253 -3.383*** -1.019 -1.644** -3.151***
(-1.695, 1.189) (-4.710, -2.057) (-2.310, 0.271) (-2.967, -0.321) (-4.453, -1.848)
t = -0.344 t = -5.001 t = -1.549 t = -2.435 t = -4.740
p = 0.732 p = 0.00000 p = 0.122 p = 0.015 p = 0.00001
ns(speechiness, df = 5)3 1.046** -2.643*** -0.523 -0.727 -3.373***
(0.006, 2.086) (-3.749, -1.538) (-1.565, 0.518) (-1.842, 0.388) (-4.669, -2.076)
t = 1.971 t = -4.687 t = -0.985 t = -1.278 t = -5.099
p = 0.049 p = 0.00001 p = 0.325 p = 0.202 p = 0.00000
ns(speechiness, df = 5)4 0.098 -7.129*** -2.850** -3.690*** -8.328***
(-2.702, 2.897) (-9.654, -4.604) (-5.322, -0.378) (-6.204, -1.175) (-10.862, -5.793)
t = 0.068 t = -5.533 t = -2.260 t = -2.876 t = -6.440
p = 0.946 p = 0.00000 p = 0.024 p = 0.005 p = 0.000
ns(speechiness, df = 5)5 2.286*** -3.408*** -2.019*** -2.128*** -5.646***
(1.129, 3.443) (-4.783, -2.032) (-3.334, -0.704) (-3.544, -0.712) (-7.667, -3.625)
t = 3.873 t = -4.857 t = -3.009 t = -2.945 t = -5.475
p = 0.0002 p = 0.00001 p = 0.003 p = 0.004 p = 0.00000
ns(acousticness, df = 5)1 -0.499* 0.760** -0.110 0.663** -0.735**
(-1.081, 0.084) (0.109, 1.411) (-0.664, 0.444) (0.011, 1.315) (-1.331, -0.138)
t = -1.677 t = 2.287 t = -0.389 t = 1.994 t = -2.415
p = 0.094 p = 0.023 p = 0.698 p = 0.047 p = 0.016
ns(acousticness, df = 5)2 0.088 1.952*** 0.992** 1.408*** -0.679
(-0.725, 0.902) (1.092, 2.812) (0.213, 1.771) (0.540, 2.276) (-1.488, 0.131)
t = 0.213 t = 4.447 t = 2.496 t = 3.178 t = -1.644
p = 0.832 p = 0.00001 p = 0.013 p = 0.002 p = 0.101
ns(acousticness, df = 5)3 0.789 2.996*** 0.897 1.764* 0.714
(-1.069, 2.648) (1.159, 4.832) (-0.916, 2.709) (-0.073, 3.601) (-1.148, 2.577)
t = 0.832 t = 3.196 t = 0.970 t = 1.882 t = 0.752
p = 0.406 p = 0.002 p = 0.333 p = 0.060 p = 0.453
ns(acousticness, df = 5)4 2.313 6.312*** 4.696*** 4.490*** 0.575
(-0.934, 5.559) (3.016, 9.609) (1.491, 7.901) (1.202, 7.778) (-2.631, 3.780)
t = 1.396 t = 3.753 t = 2.872 t = 2.676 t = 0.352
p = 0.163 p = 0.0002 p = 0.005 p = 0.008 p = 0.726
ns(acousticness, df = 5)5 4.053 8.559*** 6.796** 4.960 3.406
(-2.377, 10.484) (2.170, 14.948) (0.427, 13.165) (-1.423, 11.342) (-2.995, 9.807)
t = 1.235 t = 2.626 t = 2.091 t = 1.523 t = 1.043
p = 0.217 p = 0.009 p = 0.037 p = 0.128 p = 0.298
ns(instrumentalness, df = 5)1 2.706*** 3.122*** 2.456*** 2.409*** 1.623***
(1.601, 3.811) (0.906, 5.339) (1.599, 3.314) (1.494, 3.325) (0.783, 2.462)
t = 4.798 t = 2.761 t = 5.615 t = 5.158 t = 3.789
p = 0.00001 p = 0.006 p = 0.00000 p = 0.00000 p = 0.0002
ns(instrumentalness, df = 5)2 0.420 1.009 0.426 1.276*** 1.541***
(-0.710, 1.549) (-1.166, 3.185) (-0.471, 1.323) (0.330, 2.222) (0.681, 2.402)
t = 0.728 t = 0.909 t = 0.931 t = 2.643 t = 3.511
p = 0.467 p = 0.364 p = 0.353 p = 0.009 p = 0.0005
ns(instrumentalness, df = 5)3 -1.719* -1.274 -2.293*** -0.952 0.201
(-3.739, 0.300) (-4.842, 2.295) (-3.638, -0.948) (-2.418, 0.515) (-0.832, 1.235)
t = -1.668 t = -0.700 t = -3.342 t = -1.271 t = 0.382
p = 0.096 p = 0.485 p = 0.001 p = 0.204 p = 0.703
ns(instrumentalness, df = 5)4 -0.317 -2.104** 0.781 0.750 1.451*
(-1.910, 1.276) (-3.983, -0.224) (-0.672, 2.234) (-0.727, 2.228) (-0.002, 2.903)
t = -0.390 t = -2.194 t = 1.053 t = 0.995 t = 1.957
p = 0.697 p = 0.029 p = 0.293 p = 0.320 p = 0.051
ns(instrumentalness, df = 5)5 -5.602*** -10.625** -2.221*** -3.478*** -1.274**
(-9.281, -1.922) (-20.737, -0.514) (-3.505, -0.936) (-5.431, -1.525) (-2.264, -0.284)
t = -2.984 t = -2.060 t = -3.389 t = -3.490 t = -2.523
p = 0.003 p = 0.040 p = 0.001 p = 0.0005 p = 0.012
ns(valence, df = 5)1 0.805* 2.694*** 1.543*** 2.508*** 3.812***
(-0.018, 1.627) (1.537, 3.850) (0.815, 2.271) (1.654, 3.361) (3.016, 4.609)
t = 1.918 t = 4.565 t = 4.153 t = 5.759 t = 9.382
p = 0.056 p = 0.00001 p = 0.00004 p = 0.000 p = 0.000
ns(valence, df = 5)2 1.780*** 4.623*** 2.218*** 3.910*** 5.041***
(0.685, 2.874) (3.173, 6.074) (1.217, 3.220) (2.766, 5.054) (3.959, 6.123)
t = 3.186 t = 6.247 t = 4.343 t = 6.699 t = 9.132
p = 0.002 p = 0.000 p = 0.00002 p = 0.000 p = 0.000
ns(valence, df = 5)3 2.593*** 3.997*** 2.160*** 4.032*** 5.857***
(1.707, 3.480) (2.982, 5.013) (1.305, 3.015) (3.122, 4.942) (4.963, 6.751)
t = 5.732 t = 7.714 t = 4.954 t = 8.679 t = 12.837
p = 0.000 p = 0.000 p = 0.00000 p = 0.000 p = 0.000
ns(valence, df = 5)4 3.192*** 9.336*** 4.080*** 7.466*** 10.348***
(1.150, 5.233) (6.546, 12.126) (2.257, 5.903) (5.352, 9.581) (8.369, 12.327)
t = 3.065 t = 6.559 t = 4.387 t = 6.920 t = 10.247
p = 0.003 p = 0.000 p = 0.00002 p = 0.000 p = 0.000
ns(valence, df = 5)5 2.656*** 5.742*** 3.328*** 5.128*** 7.803***
(1.542, 3.770) (4.656, 6.829) (2.252, 4.404) (4.034, 6.221) (6.731, 8.874)
t = 4.673 t = 10.360 t = 6.063 t = 9.188 t = 14.269
p = 0.00001 p = 0.000 p = 0.000 p = 0.000 p = 0.000
ns(tempo, df = 5)1 0.878 2.584* 1.363 1.302 3.620***
(-1.703, 3.460) (-0.240, 5.409) (-1.189, 3.915) (-1.258, 3.861) (0.961, 6.278)
t = 0.667 t = 1.794 t = 1.047 t = 0.997 t = 2.669
p = 0.505 p = 0.073 p = 0.296 p = 0.319 p = 0.008
ns(tempo, df = 5)2 -1.190 1.849 -1.449 -1.647 0.585
(-4.051, 1.670) (-1.287, 4.984) (-4.255, 1.357) (-4.474, 1.181) (-2.344, 3.514)
t = -0.815 t = 1.155 t = -1.012 t = -1.142 t = 0.391
p = 0.415 p = 0.248 p = 0.312 p = 0.254 p = 0.696
ns(tempo, df = 5)3 1.884** 2.459*** 0.673 0.237 1.529*
(0.352, 3.417) (0.817, 4.100) (-0.826, 2.173) (-1.314, 1.788) (-0.028, 3.087)
t = 2.410 t = 2.935 t = 0.880 t = 0.299 t = 1.924
p = 0.016 p = 0.004 p = 0.379 p = 0.765 p = 0.055
ns(tempo, df = 5)4 3.484 11.546*** 1.876 2.161 4.243
(-2.421, 9.390) (5.093, 18.000) (-3.924, 7.677) (-3.679, 8.001) (-1.838, 10.324)
t = 1.156 t = 3.507 t = 0.634 t = 0.725 t = 1.368
p = 0.248 p = 0.0005 p = 0.527 p = 0.469 p = 0.172
ns(tempo, df = 5)5 -2.830*** 1.493 -0.345 -0.951 -1.180
(-4.729, -0.930) (-0.415, 3.401) (-2.154, 1.463) (-2.893, 0.991) (-3.120, 0.760)
t = -2.920 t = 1.533 t = -0.374 t = -0.960 t = -1.192
p = 0.004 p = 0.126 p = 0.709 p = 0.338 p = 0.234
Constant -1.779 -6.723** 1.630 2.429 5.166**
(-6.936, 3.378) (-12.259, -1.187) (-3.138, 6.398) (-2.410, 7.268) (0.389, 9.943)
t = -0.676 t = -2.380 t = 0.670 t = 0.984 t = 2.119
p = 0.500 p = 0.018 p = 0.503 p = 0.326 p = 0.035
Akaike Inf. Crit. 14,849.350 14,849.350 14,849.350 14,849.350 14,849.350
Note: p<0.1; p<0.05; p<0.01
# Test-set accuracy of the spline model
mean(predict(multi.log.ns, newdata = testSet, type = "class") == testSet$genre)
## [1] 0.5623487
# One-vs-rest ROC curves on the test set for any model that supports
# predict(type = "prob"). Uses the global testSet.
# FIXES: removed pr_df (computed but never used); dropped the redundant
# explicit return(); renamed the misspelled local 'true_lable'.
plotMultiROC <- function(model) {
  # Predicted class probabilities, one column per genre
  pred <- data.frame(predict(model, type = "prob", newdata = testSet))
  colnames(pred) <- paste(levels(testSet$genre), "_pred_M")

  # One-hot encoding of the true genres
  true_label <- data.frame(dummy(testSet$genre))
  colnames(true_label) <- paste(levels(testSet$genre), "_true")

  final <- cbind(true_label, pred)

  # Per-class curves only; drop the micro/macro averages
  roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
    filter(Group != "Micro", Group != "Macro")

  ggplot(roc_df, aes(x = 1 - Specificity, y = Sensitivity)) +
    geom_path(aes(color = Group)) +
    geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_color_brewer(palette = "Accent")
}

# FIX: corrected "Natrual" -> "Natural" in the displayed plot title
plotMultiROC(multi.log.ns) +
  labs(title = "Logistic Regression w/ Natural Spline ROC",
       x = "FPR", y = "TPR")

# Confusion-matrix heat map on the test set for any model that supports
# predict(type = "class"). Uses the global testSet.
plotCM <- function(model){
  preds <- predict(model, type = "class", newdata = testSet)
  cm <- confusionMatrix(preds, testSet$genre)$table
  plot_confusion_matrix(as_tibble(cm),
                        target_col = "Reference",
                        prediction_col = "Prediction",
                        counts_col = "n")
}

plotCM(multi.log.ns)

5.3 Tree based methods

5.3.1 Decision tree

# Decision tree on the training set
dt <- rpart(
  genre ~ danceability + energy + mode + speechiness +
    acousticness + instrumentalness + valence + tempo,
  data = trainSet)

# One palette colour per genre (Accent palette)
genre_palette <- list("#7fc97f", "#beaed4", "#fdc086",
                      "#ffff99", "#386cb0", "#f0027f")
rpart.plot(dt, type = 5, extra = 104,
           box.palette = genre_palette,
           leaf.round = 0, fallen.leaves = F, branch = 0.3, under = T,
           under.col = 'grey40', main = 'Decision Tree', tweak = 1.2)

# Test-set accuracy
mean(predict(dt, type = "class", newdata = testSet) == testSet$genre)
## [1] 0.4340194
plotMultiROC(dt) + labs(title = "Decision Tree ROC", x = "FPR", y = "TPR")

plotCM(dt)

5.3.2 Random forest

# Random forest with 100 trees, tracking variable importance
rf <- randomForest(
  as.factor(genre) ~ danceability + energy + mode + speechiness +
    acousticness + instrumentalness + valence + tempo,
  data = trainSet, ntree = 100, importance = TRUE)

# Test-set accuracy
mean(predict(rf, testSet) == testSet$genre)
## [1] 0.5702179
plotMultiROC(rf) + labs(title = "Random Forest ROC", x = "FPR", y = "TPR")

plotCM(rf)

5.3.3 Xgboost

# xgboost wants numeric matrices with 0-based integer class labels
matrix_train <- xgb.DMatrix(
  data = data.matrix(trainSet[, -1]),
  label = as.integer(as.factor(trainSet[, 1])) - 1)
matrix_test <- xgb.DMatrix(
  data = data.matrix(testSet[, -1]),
  label = as.integer(as.factor(testSet[, 1])) - 1)

# Two fits with identical settings: one returning hard classes, one
# returning class probabilities (needed for the ROC curves)
xgb_class <- xgboost(data = matrix_train,
                     nrounds = 50,
                     verbose = F,
                     params = list(objective = "multi:softmax",
                                   num_class = 6))

xgb_prob <- xgboost(data = matrix_train,
                    nrounds = 50,
                    verbose = F,
                    params = list(objective = "multi:softprob",
                                  num_class = 6))
# Hard-class accuracy on the test set (labels are 0-based, hence the +1)
mean(predict(xgb_class, matrix_test)+1==as.integer(as.factor(testSet$genre)))
## [1] 0.5671913
# Reshape the flat probability vector into one row per song
pred_prob <- matrix(predict(xgb_prob, newdata = matrix_test),
                    nrow = nrow(testSet), byrow = T) %>%
  as.data.frame()
colnames(pred_prob) <- paste(levels(testSet$genre), "_pred_XGB")

# One-hot encoding of the true genres
true_lable <- data.frame(dummy(testSet$genre))
colnames(true_lable) <- paste(levels(testSet$genre), "_true")

final <- cbind(true_lable, pred_prob)

# Per-class ROC/PR data; drop micro/macro averages
roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
  filter(Group != "Micro", Group != "Macro")
pr_df <- plot_pr_data(multi_pr(final, force_diag = F)) %>%
  filter(Group != "Micro", Group != "Macro")

ggplot(roc_df, aes(x = 1 - Specificity, y = Sensitivity)) +
  geom_path(aes(color = Group)) +
  geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_brewer(palette = "Accent") +
  labs(title = "Gradient Boost ROC", x = "FPR", y = "TPR")

# Map the xgboost integer predictions (1-based after the +1) back to the
# genre factor so confusionMatrix() can compare them to testSet$genre.
# NOTE(review): mapLevels() assumes the integer codes follow the order of
# levels(trainSet$genre), which matches the 0-based labels built from
# as.factor() when the DMatrix was created -- verify if levels ever change.
pred_class <- as.integer(predict(xgb_class, matrix_test) + 1)
map <- mapLevels(x=levels(trainSet$genre))
mapLevels(x=pred_class) <- map

# Confusion-matrix heat map for the xgboost predictions
plot_confusion_matrix(as_tibble(confusionMatrix(pred_class, testSet$genre)$table), 
                      target_col = "Reference", prediction_col = "Prediction", 
                      counts_col = "n")

5.4 SVM

5.4.1 Linear kernel

# SVM with a linear kernel
svm.linear <- svm(
  as.factor(genre) ~ danceability + energy + mode + speechiness +
    acousticness + instrumentalness + valence + tempo,
  data = trainSet, kernel = "linear", cost = 10)

# Test-set accuracy
mean(predict(svm.linear, testSet) == testSet$genre)
## [1] 0.5175545

5.4.2 Polynomial kernel

# SVM with a polynomial kernel; probability = T enables class probabilities
svm.poly <- svm(
  as.factor(genre) ~ danceability + energy + mode + speechiness +
    acousticness + instrumentalness + valence + tempo,
  data = trainSet, kernel = "poly", cost = 10, probability = T)

# Test-set accuracy
mean(predict(svm.poly, testSet) == testSet$genre)
## [1] 0.5435835
# Class probabilities from the polynomial SVM (stored as an attribute)
svm_pred <- predict(svm.poly, testSet, probability = T)
pred_prob <- as.data.frame(attr(svm_pred, "probabilities"))
colnames(pred_prob) <- paste(colnames(pred_prob), "_pred_SVM")

# One-hot encoding of the true genres
true_lable <- data.frame(dummy(testSet$genre))
colnames(true_lable) <- paste(levels(testSet$genre), "_true")

final <- cbind(true_lable, pred_prob)

# Per-class ROC/PR data; drop micro/macro averages
roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
  filter(Group != "Micro", Group != "Macro")
pr_df <- plot_pr_data(multi_pr(final, force_diag = F)) %>%
  filter(Group != "Micro", Group != "Macro")

ggplot(roc_df, aes(x = 1 - Specificity, y = Sensitivity)) +
  geom_path(aes(color = Group)) +
  geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_brewer(palette = "Accent") +
  labs(title = "Poly SVM ROC", x = "FPR", y = "TPR")

plotCM(svm.poly)

5.5 Comparison

5.5.1 Feature importance

# Variable importance from the three tree-based models.
# FIX: dt$variable relied on $ partial matching to reach the
# variable.importance component of the rpart fit (a known footgun);
# the full name is used, and the matching rename() updated to the
# resulting column name.
dt_imp <- as.data.frame(dt$variable.importance)
dt_imp$feature <- rownames(dt_imp)
# Random forest: mean decrease in Gini impurity
rf_imp <- as.data.frame(randomForest::importance(rf, 2))
rf_imp$feature <- rownames(rf_imp)
# xgboost: gain
xgb_imp <- xgb.importance(model = xgb_class) %>% 
  select(Feature, Gain)

imp <- dplyr::left_join(dt_imp, rf_imp, by = "feature") %>% 
  dplyr::left_join(xgb_imp, by = c("feature" = "Feature")) %>%
  rename('Xgboost' = 'Gain',
         'Decision tree' = 'dt$variable.importance',
         'Random forest' = 'MeanDecreaseGini') 

# Scale each model's importance scores so they are comparable on one plot
imp %>%
  mutate_if(is.numeric, scale, center = TRUE) %>%
  pivot_longer(cols = c('Xgboost', 'Decision tree', 'Random forest')) %>%
  rename('model' = 'name') %>%
  ggplot(aes(x = reorder(feature, value, mean, na.rm = TRUE), 
             y = value, color = model)) + 
  geom_point(size = 2) + 
  coord_flip() +
  labs(title = 'Scaled Variable Importance by Model',
       y = 'Scaled value', x = '') +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_brewer(palette = 'Set2')

5.5.2 Accuracy

# Per-genre classification breakdown for every model on the test set.
# FIX: the displayed model label 'Decisoin tree' was misspelled in both
# the data and the factor levels; corrected to 'Decision tree'.

# Helper: one data.frame of (model label, truth, prediction) per model
collect_preds <- function(model_name, predicted) {
  data.frame(model = model_name, actual = testSet$genre,
             predicted = predicted, stringsAsFactors = FALSE)
}

rbind(
  collect_preds('Logistic regression',
                predict(multi.log.ns, testSet, type = "class")),
  collect_preds('Decision tree', predict(dt, testSet, type = "class")),
  collect_preds('Random forest', predict(rf, testSet, type = "class")),
  collect_preds('Xgboost', pred_class),
  collect_preds('SVM', predict(svm.poly, testSet))) %>%
  count(actual, predicted, model) %>%
  mutate(match = actual == predicted) %>%
  group_by(actual, model) %>%
  mutate(pct = n / sum(n)) %>%
  ungroup() %>%
  # Label only the correctly classified bar of each facet row
  mutate(label = ifelse(match, paste0(round(pct * 100, 1), '%'), ""),
         model = factor(model, levels = c('Decision tree', 'Random forest',
                                          'Xgboost', 'Logistic regression',
                                          'SVM'))) %>%
  ggplot(aes(x = actual,
             y = pct,
             fill = predicted,
             label = label)) +
  geom_col(position = 'dodge') +
  geom_text(position = position_dodge(width = 1),
            cex = 2.75,
            hjust = -0.1) +
  facet_wrap(~ model, ncol = 3) +
  coord_flip() +
  labs(title = 'Genre Accuracy by Model',
       x = "",
       y = 'Percent classified',
       fill = "") +
  ylim(c(0, .85)) +
  theme(panel.grid.major.y = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Accent")